; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s

; Test using saddr addressing mode of global_* flat atomic instructions.

; --------------------------------------------------------------------------------
; atomicrmw max
; --------------------------------------------------------------------------------

; Returning 32-bit signed atomic max at %sbase + zext(%voffset), with no extra
; constant offset. %sbase is an inreg argument (s[2:3] in the expected output)
; and the previous memory value is returned, bitcast to float.
; The expected output for all three targets is a compare-and-swap retry loop
; (%atomicrmw.start): the initial load uses the SGPR-base + VGPR-offset form,
; while global_atomic_cmpswap takes a 64-bit VGPR address (v[2:3], `off`)
; built by the add / add-with-carry pair ahead of the loop.
define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_max_saddr_i32_rtn:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB0_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    v_max_i32_e32 v4, v5, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB0_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_max_saddr_i32_rtn:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB0_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    v_max_i32_e32 v4, v5, v1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB0_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_max_saddr_i32_rtn:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v2, v0
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB0_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v5, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_max_i32_e32 v4, v5, v1
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB0_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT:    ; return to shader part epilog
  ; Address = %sbase + zext(%voffset): a zero-extended 32-bit byte offset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  ; seq_cst signed max; %rtn is the value that was in memory beforehand.
  %rtn = atomicrmw max ptr addrspace(1) %gep0, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

; Returning 32-bit signed atomic max at %sbase + zext(%voffset) - 128.
; Same shape as the zero-offset returning i32 case, plus a constant -128 byte
; GEP. In the expected output the constant shows up as `offset:-128` on both
; the initial global load and the global_atomic_cmpswap inside the retry
; loop; the add / add-with-carry pair only combines s[2:3] with the VGPR
; offset.
define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_max_saddr_i32_rtn_neg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB1_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    v_max_i32_e32 v4, v5, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB1_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_max_saddr_i32_rtn_neg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB1_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    v_max_i32_e32 v4, v5, v1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB1_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_max_saddr_i32_rtn_neg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v2, v0
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB1_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v5, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_max_i32_e32 v4, v5, v1
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB1_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT:    ; return to shader part epilog
  ; Address = %sbase + zext(%voffset), then a second GEP applies -128 bytes.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  ; seq_cst signed max; %rtn is the value that was in memory beforehand.
  %rtn = atomicrmw max ptr addrspace(1) %gep1, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

; Non-returning 32-bit signed atomic max at %sbase + zext(%voffset): the
; atomicrmw result is named %unused and the function returns void.
; The expected output is still a compare-and-swap retry loop with the `glc`
; form of global_atomic_cmpswap, because the value it returns feeds the loop
; exit compare and is copied into v5 as the next iteration's expected value.
; The program ends with s_endpgm.
define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_max_saddr_i32_nortn:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dword v5, v0, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB2_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_i32_e32 v4, v5, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB2_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_max_saddr_i32_nortn:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dword v5, v0, s[2:3]
; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB2_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_max_i32_e32 v4, v5, v1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB2_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: global_max_saddr_i32_nortn:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v5, v0, s[2:3]
; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB2_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_max_i32_e32 v4, v5, v1
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT:    v_mov_b32_e32 v5, v0
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB2_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_endpgm
  ; Address = %sbase + zext(%voffset): a zero-extended 32-bit byte offset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  ; seq_cst signed max whose result has no IR users.
  %unused = atomicrmw max ptr addrspace(1) %gep0, i32 %data seq_cst
  ret void
}

; Non-returning 32-bit signed atomic max at %sbase + zext(%voffset) - 128.
; Combines the unused-result case with a constant -128 byte GEP. In the
; expected output the constant appears as `offset:-128` on both the initial
; global load and the global_atomic_cmpswap in the retry loop, and the
; program ends with s_endpgm.
define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_max_saddr_i32_nortn_neg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dword v5, v0, s[2:3] offset:-128
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB3_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_i32_e32 v4, v5, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB3_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_max_saddr_i32_nortn_neg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dword v5, v0, s[2:3] offset:-128
; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB3_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_max_i32_e32 v4, v5, v1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB3_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: global_max_saddr_i32_nortn_neg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v5, v0, s[2:3] offset:-128
; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB3_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_max_i32_e32 v4, v5, v1
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT:    v_mov_b32_e32 v5, v0
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB3_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_endpgm
  ; Address = %sbase + zext(%voffset), then a second GEP applies -128 bytes.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  ; seq_cst signed max whose result has no IR users.
  %unused = atomicrmw max ptr addrspace(1) %gep1, i32 %data seq_cst
  ret void
}

; Returning 64-bit signed atomic max at %sbase + zext(%voffset); the previous
; memory value is returned, bitcast to <2 x float>.
; In the expected output the 64-bit max is formed inside the retry loop by a
; signed 64-bit compare (v_cmp_gt_i64) followed by two v_cndmask selects, and
; the 64-bit compare-and-swap takes a four-register data operand (v[7:10]).
; After the loop the result pair v[3:4] is copied into v0/v1.
define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_max_saddr_i64_rtn:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v6, s3
; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB4_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v10, v4
; GFX9-NEXT:    v_mov_b32_e32 v9, v3
; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
; GFX9-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB4_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, v3
; GFX9-NEXT:    v_mov_b32_e32 v1, v4
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_max_saddr_i64_rtn:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3]
; GFX10-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB4_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v10, v4
; GFX10-NEXT:    v_mov_b32_e32 v9, v3
; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
; GFX10-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB4_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v0, v3
; GFX10-NEXT:    v_mov_b32_e32 v1, v4
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_max_saddr_i64_rtn:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[3:4], v0, s[2:3]
; GFX11-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB4_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v10, v4
; GFX11-NEXT:    v_mov_b32_e32 v9, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
; GFX11-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB4_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v0, v3
; GFX11-NEXT:    v_mov_b32_e32 v1, v4
; GFX11-NEXT:    ; return to shader part epilog
  ; Address = %sbase + zext(%voffset): a zero-extended 32-bit byte offset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  ; seq_cst signed 64-bit max; %rtn is the value that was in memory beforehand.
  %rtn = atomicrmw max ptr addrspace(1) %gep0, i64 %data seq_cst
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

; Returning 64-bit signed atomic max at %sbase + zext(%voffset) - 128; the
; previous memory value is returned, bitcast to <2 x float>.
; Same shape as the zero-offset returning i64 case, plus a constant -128 byte
; GEP. In the expected output the constant appears as `offset:-128` on both
; the initial 64-bit load and the 64-bit compare-and-swap in the retry loop.
define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_max_saddr_i64_rtn_neg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
; GFX9-NEXT:    v_mov_b32_e32 v6, s3
; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB5_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v10, v4
; GFX9-NEXT:    v_mov_b32_e32 v9, v3
; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
; GFX9-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB5_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, v3
; GFX9-NEXT:    v_mov_b32_e32 v1, v4
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_max_saddr_i64_rtn_neg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
; GFX10-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB5_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v10, v4
; GFX10-NEXT:    v_mov_b32_e32 v9, v3
; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
; GFX10-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB5_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v0, v3
; GFX10-NEXT:    v_mov_b32_e32 v1, v4
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_max_saddr_i64_rtn_neg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[3:4], v0, s[2:3] offset:-128
; GFX11-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB5_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v10, v4
; GFX11-NEXT:    v_mov_b32_e32 v9, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
; GFX11-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB5_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v0, v3
; GFX11-NEXT:    v_mov_b32_e32 v1, v4
; GFX11-NEXT:    ; return to shader part epilog
  ; Address = %sbase + zext(%voffset), then a second GEP applies -128 bytes.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  ; seq_cst signed 64-bit max; %rtn is the value that was in memory beforehand.
  %rtn = atomicrmw max ptr addrspace(1) %gep1, i64 %data seq_cst
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

; Non-returning 64-bit signed atomic max at %sbase + zext(%voffset): the
; atomicrmw result is named %unused and the function returns void.
; In the expected output the retry loop keeps the expected value in v[5:6]
; and writes the selected new value into v[3:4], so the compare-and-swap data
; operand is the contiguous range v[3:6]. The value returned by the cmpswap
; is copied back into v[5:6] for the next iteration, and the program ends
; with s_endpgm.
define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_max_saddr_i64_nortn:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB6_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX9-NEXT:    v_mov_b32_e32 v6, v4
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v5, v3
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB6_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_max_saddr_i64_nortn:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3]
; GFX10-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB6_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT:    v_mov_b32_e32 v6, v4
; GFX10-NEXT:    v_mov_b32_e32 v5, v3
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB6_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: global_max_saddr_i64_nortn:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[5:6], v0, s[2:3]
; GFX11-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT:    v_mov_b32_e32 v6, v4
; GFX11-NEXT:    v_mov_b32_e32 v5, v3
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB6_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_endpgm
  ; Address = %sbase + zext(%voffset): a zero-extended 32-bit byte offset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  ; seq_cst signed 64-bit max whose result has no IR users.
  %unused = atomicrmw max ptr addrspace(1) %gep0, i64 %data seq_cst
  ret void
}

; Non-returning 64-bit signed atomic max at %sbase + zext(%voffset) - 128.
; Combines the unused-result i64 case with a constant -128 byte GEP. In the
; expected output the constant appears as `offset:-128` on both the initial
; 64-bit load and the 64-bit compare-and-swap in the retry loop, and the
; program ends with s_endpgm.
define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_max_saddr_i64_nortn_neg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB7_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX9-NEXT:    v_mov_b32_e32 v6, v4
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v5, v3
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB7_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_max_saddr_i64_nortn_neg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
; GFX10-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB7_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT:    v_mov_b32_e32 v6, v4
; GFX10-NEXT:    v_mov_b32_e32 v5, v3
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: global_max_saddr_i64_nortn_neg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[5:6], v0, s[2:3] offset:-128
; GFX11-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB7_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT:    v_mov_b32_e32 v6, v4
; GFX11-NEXT:    v_mov_b32_e32 v5, v3
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB7_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_endpgm
  ; Address = %sbase + zext(%voffset), then a second GEP applies -128 bytes.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  ; seq_cst signed 64-bit max whose result has no IR users.
  %unused = atomicrmw max ptr addrspace(1) %gep1, i64 %data seq_cst
  ret void
}

; --------------------------------------------------------------------------------
; atomicrmw min
; --------------------------------------------------------------------------------

; Signed i32 `atomicrmw min` (seq_cst) at %sbase + zext(%voffset), no extra
; constant offset; the atomicrmw result is bitcast to float and returned.
; For all three check prefixes the expected code is an initial load followed
; by a global_atomic_cmpswap retry loop. The load takes the SGPR base s[2:3]
; with v0 as the offset, whereas the cmpswap uses the 64-bit VGPR address
; v[2:3] built by the add / add-with-carry pair, with `off` as the base.
; Each iteration copies the last observed value into v5, computes the minimum
; with %data (v1) into v4, and loops until the cmpswap result equals v5.
define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_min_saddr_i32_rtn:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB8_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    v_min_i32_e32 v4, v5, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB8_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_min_saddr_i32_rtn:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB8_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    v_min_i32_e32 v4, v5, v1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_min_saddr_i32_rtn:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v2, v0
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB8_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v5, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_min_i32_e32 v4, v5, v1
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %rtn = atomicrmw min ptr addrspace(1) %gep0, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

; Signed i32 `atomicrmw min` (seq_cst) at %sbase + zext(%voffset) - 128; the
; atomicrmw result is bitcast to float and returned.
; Same load + global_atomic_cmpswap retry loop as the variant without a
; constant offset, except that the -128 byte displacement from the second GEP
; is expected as an `offset:-128` immediate on both the initial load and the
; cmpswap for all three check prefixes (it is not folded into the address
; arithmetic that produces v[2:3]).
define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_min_saddr_i32_rtn_neg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB9_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    v_min_i32_e32 v4, v5, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB9_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_min_saddr_i32_rtn_neg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB9_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    v_min_i32_e32 v4, v5, v1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB9_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_min_saddr_i32_rtn_neg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v2, v0
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB9_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v5, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_min_i32_e32 v4, v5, v1
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB9_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  %rtn = atomicrmw min ptr addrspace(1) %gep1, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

; Signed i32 `atomicrmw min` (seq_cst) at %sbase + zext(%voffset) whose result
; is unused; the function returns void and the checks end in s_endpgm.
; The expected code is still a load + global_atomic_cmpswap (glc) retry loop
; for all three check prefixes, because the loop itself needs the value the
; cmpswap returns. The expected value lives in v5: it is seeded by the
; initial load and refreshed from the cmpswap result (v_mov_b32 v5, v0) after
; each compare, so no separate copy is needed at the top of the loop.
define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_min_saddr_i32_nortn:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dword v5, v0, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB10_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_min_i32_e32 v4, v5, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB10_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_min_saddr_i32_nortn:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dword v5, v0, s[2:3]
; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB10_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_min_i32_e32 v4, v5, v1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB10_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: global_min_saddr_i32_nortn:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v5, v0, s[2:3]
; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB10_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_min_i32_e32 v4, v5, v1
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT:    v_mov_b32_e32 v5, v0
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB10_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %unused = atomicrmw min ptr addrspace(1) %gep0, i32 %data seq_cst
  ret void
}

; Signed i32 `atomicrmw min` (seq_cst) at %sbase + zext(%voffset) - 128 whose
; result is unused; the function returns void and the checks end in s_endpgm.
; Expected code is the load + global_atomic_cmpswap (glc) retry loop with the
; expected value carried in v5 across iterations. The -128 byte displacement
; from the second GEP is expected as an `offset:-128` immediate on both the
; initial load and the cmpswap for all three check prefixes.
define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_min_saddr_i32_nortn_neg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dword v5, v0, s[2:3] offset:-128
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB11_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_min_i32_e32 v4, v5, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB11_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_min_saddr_i32_nortn_neg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dword v5, v0, s[2:3] offset:-128
; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB11_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_min_i32_e32 v4, v5, v1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB11_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: global_min_saddr_i32_nortn_neg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v5, v0, s[2:3] offset:-128
; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB11_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_min_i32_e32 v4, v5, v1
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT:    v_mov_b32_e32 v5, v0
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB11_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  %unused = atomicrmw min ptr addrspace(1) %gep1, i32 %data seq_cst
  ret void
}

; Signed i64 `atomicrmw min` (seq_cst) at %sbase + zext(%voffset), no extra
; constant offset; the atomicrmw result is bitcast to <2 x float> and returned.
; Expected code for all three check prefixes is a 64-bit load followed by a
; 64-bit cmpswap retry loop (global_atomic_cmpswap_x2, spelled
; global_atomic_cmpswap_b64 in the gfx11 checks). Inside the loop the last
; observed value is copied to v[9:10], the minimum with %data (v[1:2]) is
; selected into v[7:8] by v_cmp_le_i64 plus two v_cndmask_b32, and the loop
; repeats until the cmpswap result v[3:4] equals v[9:10]. After the loop the
; result is moved into v0/v1 for the return.
define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_min_saddr_i64_rtn:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v6, s3
; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB12_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v10, v4
; GFX9-NEXT:    v_mov_b32_e32 v9, v3
; GFX9-NEXT:    v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
; GFX9-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB12_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, v3
; GFX9-NEXT:    v_mov_b32_e32 v1, v4
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_min_saddr_i64_rtn:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3]
; GFX10-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB12_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v10, v4
; GFX10-NEXT:    v_mov_b32_e32 v9, v3
; GFX10-NEXT:    v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
; GFX10-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB12_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v0, v3
; GFX10-NEXT:    v_mov_b32_e32 v1, v4
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_min_saddr_i64_rtn:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[3:4], v0, s[2:3]
; GFX11-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB12_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v10, v4
; GFX11-NEXT:    v_mov_b32_e32 v9, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
; GFX11-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB12_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v0, v3
; GFX11-NEXT:    v_mov_b32_e32 v1, v4
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %rtn = atomicrmw min ptr addrspace(1) %gep0, i64 %data seq_cst
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

; Signed i64 `atomicrmw min` (seq_cst) at %sbase + zext(%voffset) - 128; the
; atomicrmw result is bitcast to <2 x float> and returned.
; Expected code is the 64-bit load + 64-bit cmpswap retry loop, with the
; minimum selected by v_cmp_le_i64 plus two v_cndmask_b32. The -128 byte
; displacement from the second GEP is expected as an `offset:-128` immediate
; on both the initial load and the cmpswap for all three check prefixes.
define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_min_saddr_i64_rtn_neg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
; GFX9-NEXT:    v_mov_b32_e32 v6, s3
; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB13_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v10, v4
; GFX9-NEXT:    v_mov_b32_e32 v9, v3
; GFX9-NEXT:    v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
; GFX9-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB13_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, v3
; GFX9-NEXT:    v_mov_b32_e32 v1, v4
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_min_saddr_i64_rtn_neg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
; GFX10-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB13_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v10, v4
; GFX10-NEXT:    v_mov_b32_e32 v9, v3
; GFX10-NEXT:    v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
; GFX10-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB13_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v0, v3
; GFX10-NEXT:    v_mov_b32_e32 v1, v4
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_min_saddr_i64_rtn_neg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[3:4], v0, s[2:3] offset:-128
; GFX11-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB13_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v10, v4
; GFX11-NEXT:    v_mov_b32_e32 v9, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
; GFX11-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB13_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v0, v3
; GFX11-NEXT:    v_mov_b32_e32 v1, v4
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  %rtn = atomicrmw min ptr addrspace(1) %gep1, i64 %data seq_cst
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

; Signed i64 `atomicrmw min` (seq_cst) at %sbase + zext(%voffset) whose result
; is unused; the function returns void and the checks end in s_endpgm.
; Expected code is still a 64-bit load + 64-bit cmpswap (glc) retry loop for
; all three check prefixes. The expected value is carried in v[5:6]: seeded by
; the initial load and refreshed from the cmpswap result v[3:4] after each
; compare. The new value is selected into v[3:4] by v_cmp_le_i64 plus two
; v_cndmask_b32, so the cmpswap data operand is the contiguous tuple v[3:6].
define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_min_saddr_i64_nortn:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB14_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX9-NEXT:    v_mov_b32_e32 v6, v4
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v5, v3
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB14_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_min_saddr_i64_nortn:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3]
; GFX10-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB14_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT:    v_mov_b32_e32 v6, v4
; GFX10-NEXT:    v_mov_b32_e32 v5, v3
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: global_min_saddr_i64_nortn:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[5:6], v0, s[2:3]
; GFX11-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB14_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT:    v_mov_b32_e32 v6, v4
; GFX11-NEXT:    v_mov_b32_e32 v5, v3
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB14_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %unused = atomicrmw min ptr addrspace(1) %gep0, i64 %data seq_cst
  ret void
}

; Signed i64 `atomicrmw min` (seq_cst) at %sbase + zext(%voffset) - 128 whose
; result is unused; the function returns void and the checks end in s_endpgm.
; Expected code is the 64-bit load + 64-bit cmpswap (glc) retry loop with the
; expected value carried in v[5:6] and the new value selected into v[3:4] by
; v_cmp_le_i64 plus two v_cndmask_b32. The -128 byte displacement from the
; second GEP is expected as an `offset:-128` immediate on both the initial
; load and the cmpswap for all three check prefixes.
define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_min_saddr_i64_nortn_neg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB15_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX9-NEXT:    v_mov_b32_e32 v6, v4
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v5, v3
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB15_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_min_saddr_i64_nortn_neg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
; GFX10-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB15_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT:    v_mov_b32_e32 v6, v4
; GFX10-NEXT:    v_mov_b32_e32 v5, v3
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: global_min_saddr_i64_nortn_neg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[5:6], v0, s[2:3] offset:-128
; GFX11-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB15_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT:    v_mov_b32_e32 v6, v4
; GFX11-NEXT:    v_mov_b32_e32 v5, v3
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB15_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  %unused = atomicrmw min ptr addrspace(1) %gep1, i64 %data seq_cst
  ret void
}

; --------------------------------------------------------------------------------
; atomicrmw umax
; --------------------------------------------------------------------------------

; umax, i32 data, result used, no constant offset.
; The address is the inreg %sbase pointer plus the zero-extended 32-bit
; %voffset (in bytes). The old memory value is returned bitcast to float.
; On all three checked targets the expected output is an initial load followed
; by a compare-and-swap retry loop (%atomicrmw.start): the load uses the
; SGPR-base saddr form, while the cmpswap takes a full 64-bit VGPR address
; built by the preceding add-with-carry pair.
define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umax_saddr_i32_rtn:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB16_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    v_max_u32_e32 v4, v5, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB16_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_umax_saddr_i32_rtn:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB16_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    v_max_u32_e32 v4, v5, v1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB16_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_umax_saddr_i32_rtn:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v2, v0
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB16_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v5, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_max_u32_e32 v4, v5, v1
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB16_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT:    ; return to shader part epilog
  ; Address = %sbase + zext(%voffset), as a byte offset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %rtn = atomicrmw umax ptr addrspace(1) %gep0, i32 %data seq_cst
  ; Return the pre-operation value, reinterpreted as float.
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

; umax, i32 data, result used, constant offset of -128 bytes.
; Same addressing as the no-offset variant (%sbase + zext(%voffset)) with an
; extra -128 byte getelementptr on top. In the expected output the constant is
; carried as an immediate "offset:-128" on both the initial load and the
; cmpswap inside the retry loop; it is not added into the address registers.
define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umax_saddr_i32_rtn_neg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB17_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    v_max_u32_e32 v4, v5, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB17_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_umax_saddr_i32_rtn_neg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB17_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    v_max_u32_e32 v4, v5, v1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB17_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_umax_saddr_i32_rtn_neg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v2, v0
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB17_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v5, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_max_u32_e32 v4, v5, v1
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB17_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT:    ; return to shader part epilog
  ; Address = %sbase + zext(%voffset) - 128, all in bytes.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  %rtn = atomicrmw umax ptr addrspace(1) %gep1, i32 %data seq_cst
  ; Return the pre-operation value, reinterpreted as float.
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

; umax, i32 data, result unused, no constant offset.
; The function returns void and the atomicrmw result (%unused) has no users.
; The expected output is still a compare-and-swap retry loop whose cmpswap
; returns the old value (glc), because the loop compares it against the
; expected value to decide whether to retry. The shader ends with s_endpgm.
define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umax_saddr_i32_nortn:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dword v5, v0, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB18_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_u32_e32 v4, v5, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB18_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_umax_saddr_i32_nortn:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dword v5, v0, s[2:3]
; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB18_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_max_u32_e32 v4, v5, v1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB18_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: global_umax_saddr_i32_nortn:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v5, v0, s[2:3]
; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB18_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_max_u32_e32 v4, v5, v1
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT:    v_mov_b32_e32 v5, v0
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB18_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_endpgm
  ; Address = %sbase + zext(%voffset), as a byte offset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  ; The old value is deliberately left unused (no-return flavour of the test).
  %unused = atomicrmw umax ptr addrspace(1) %gep0, i32 %data seq_cst
  ret void
}

; umax, i32 data, result unused, constant offset of -128 bytes.
; Combines the no-return flavour (void function, %unused has no users) with an
; extra -128 byte getelementptr. In the expected output the constant appears as
; an immediate "offset:-128" on both the initial load and the cmpswap in the
; retry loop, and the shader ends with s_endpgm.
define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umax_saddr_i32_nortn_neg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dword v5, v0, s[2:3] offset:-128
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB19_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_max_u32_e32 v4, v5, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB19_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_umax_saddr_i32_nortn_neg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dword v5, v0, s[2:3] offset:-128
; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB19_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_max_u32_e32 v4, v5, v1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB19_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: global_umax_saddr_i32_nortn_neg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v5, v0, s[2:3] offset:-128
; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB19_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_max_u32_e32 v4, v5, v1
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT:    v_mov_b32_e32 v5, v0
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB19_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_endpgm
  ; Address = %sbase + zext(%voffset) - 128, all in bytes.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  ; The old value is deliberately left unused (no-return flavour of the test).
  %unused = atomicrmw umax ptr addrspace(1) %gep1, i32 %data seq_cst
  ret void
}

; umax, i64 data, result used, no constant offset.
; The address is %sbase + zext(%voffset) in bytes. The old 64-bit value is
; returned bitcast to <2 x float>.
; In the expected output the 64-bit unsigned max is formed with v_cmp_gt_u64
; plus two v_cndmask_b32 selects, and the retry loop uses the 64-bit cmpswap
; (global_atomic_cmpswap_x2, or global_atomic_cmpswap_b64 on the gfx1100 run).
; After the loop the result pair is copied into v0/v1 for the return.
define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umax_saddr_i64_rtn:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v6, s3
; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB20_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v10, v4
; GFX9-NEXT:    v_mov_b32_e32 v9, v3
; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
; GFX9-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB20_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, v3
; GFX9-NEXT:    v_mov_b32_e32 v1, v4
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_umax_saddr_i64_rtn:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3]
; GFX10-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB20_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v10, v4
; GFX10-NEXT:    v_mov_b32_e32 v9, v3
; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
; GFX10-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB20_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v0, v3
; GFX10-NEXT:    v_mov_b32_e32 v1, v4
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_umax_saddr_i64_rtn:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[3:4], v0, s[2:3]
; GFX11-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB20_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v10, v4
; GFX11-NEXT:    v_mov_b32_e32 v9, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
; GFX11-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB20_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v0, v3
; GFX11-NEXT:    v_mov_b32_e32 v1, v4
; GFX11-NEXT:    ; return to shader part epilog
  ; Address = %sbase + zext(%voffset), as a byte offset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %rtn = atomicrmw umax ptr addrspace(1) %gep0, i64 %data seq_cst
  ; Return the pre-operation 64-bit value, reinterpreted as two floats.
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

; umax, i64 data, result used, constant offset of -128 bytes.
; Same as the i64 returning variant with an extra -128 byte getelementptr.
; In the expected output the constant appears as an immediate "offset:-128" on
; both the initial 64-bit load and the 64-bit cmpswap in the retry loop; the
; old value pair is copied into v0/v1 after the loop for the return.
define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umax_saddr_i64_rtn_neg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
; GFX9-NEXT:    v_mov_b32_e32 v6, s3
; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB21_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v10, v4
; GFX9-NEXT:    v_mov_b32_e32 v9, v3
; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
; GFX9-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB21_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, v3
; GFX9-NEXT:    v_mov_b32_e32 v1, v4
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_umax_saddr_i64_rtn_neg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
; GFX10-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB21_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v10, v4
; GFX10-NEXT:    v_mov_b32_e32 v9, v3
; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
; GFX10-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB21_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v0, v3
; GFX10-NEXT:    v_mov_b32_e32 v1, v4
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_umax_saddr_i64_rtn_neg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[3:4], v0, s[2:3] offset:-128
; GFX11-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB21_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v10, v4
; GFX11-NEXT:    v_mov_b32_e32 v9, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
; GFX11-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB21_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v0, v3
; GFX11-NEXT:    v_mov_b32_e32 v1, v4
; GFX11-NEXT:    ; return to shader part epilog
  ; Address = %sbase + zext(%voffset) - 128, all in bytes.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  %rtn = atomicrmw umax ptr addrspace(1) %gep1, i64 %data seq_cst
  ; Return the pre-operation 64-bit value, reinterpreted as two floats.
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

; umax, i64 data, result unused, no constant offset.
; The function returns void and the atomicrmw result (%unused) has no users.
; The expected output still contains the 64-bit compare-and-swap retry loop:
; the max is selected with v_cmp_gt_u64 plus two v_cndmask_b32, and the value
; returned by the cmpswap is compared against the expected pair to decide
; whether to loop again. The shader ends with s_endpgm.
define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umax_saddr_i64_nortn:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB22_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX9-NEXT:    v_mov_b32_e32 v6, v4
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v5, v3
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB22_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_umax_saddr_i64_nortn:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3]
; GFX10-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB22_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT:    v_mov_b32_e32 v6, v4
; GFX10-NEXT:    v_mov_b32_e32 v5, v3
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB22_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: global_umax_saddr_i64_nortn:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[5:6], v0, s[2:3]
; GFX11-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB22_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT:    v_mov_b32_e32 v6, v4
; GFX11-NEXT:    v_mov_b32_e32 v5, v3
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB22_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_endpgm
  ; Address = %sbase + zext(%voffset), as a byte offset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  ; The old value is deliberately left unused (no-return flavour of the test).
  %unused = atomicrmw umax ptr addrspace(1) %gep0, i64 %data seq_cst
  ret void
}

; umax, i64 data, result unused, constant offset of -128 bytes.
; Combines the no-return flavour (void function, %unused has no users) with an
; extra -128 byte getelementptr. In the expected output the constant appears as
; an immediate "offset:-128" on both the initial 64-bit load and the 64-bit
; cmpswap in the retry loop, and the shader ends with s_endpgm.
define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umax_saddr_i64_nortn_neg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB23_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX9-NEXT:    v_mov_b32_e32 v6, v4
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v5, v3
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB23_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_umax_saddr_i64_nortn_neg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
; GFX10-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB23_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT:    v_mov_b32_e32 v6, v4
; GFX10-NEXT:    v_mov_b32_e32 v5, v3
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB23_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: global_umax_saddr_i64_nortn_neg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[5:6], v0, s[2:3] offset:-128
; GFX11-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB23_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT:    v_mov_b32_e32 v6, v4
; GFX11-NEXT:    v_mov_b32_e32 v5, v3
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB23_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_endpgm
  ; Address = %sbase + zext(%voffset) - 128, all in bytes.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  ; The old value is deliberately left unused (no-return flavour of the test).
  %unused = atomicrmw umax ptr addrspace(1) %gep1, i64 %data seq_cst
  ret void
}

; --------------------------------------------------------------------------------
; atomicrmw umin
; --------------------------------------------------------------------------------

; Returning 32-bit unsigned-min atomic, no immediate offset.
; The address is %sbase (inreg) plus the zero-extended 32-bit %voffset; the
; atomicrmw is seq_cst and its old value is returned bitcast to float.
; For all three targets the checks below expect a compare-and-swap loop rather
; than a single atomic-min instruction: an initial global load in saddr form,
; then v_min_u32 + global_atomic_cmpswap (on a VGPR address pair with "off")
; repeated until the value returned by the cmpswap equals the expected one.
define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umin_saddr_i32_rtn:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB24_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    v_min_u32_e32 v4, v5, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB24_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_umin_saddr_i32_rtn:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB24_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    v_min_u32_e32 v4, v5, v1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB24_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_umin_saddr_i32_rtn:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v2, v0
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB24_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v5, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_min_u32_e32 v4, v5, v1
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB24_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %rtn = atomicrmw umin ptr addrspace(1) %gep0, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

; Returning 32-bit unsigned-min atomic with an extra constant byte offset of
; -128 applied after the %sbase + zext(%voffset) address computation.
; The old value is returned bitcast to float.
; The checks expect the same compare-and-swap loop as the zero-offset variant,
; with the constant folded into the instructions as "offset:-128" on both the
; initial global load and the global_atomic_cmpswap.
define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umin_saddr_i32_rtn_neg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB25_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    v_min_u32_e32 v4, v5, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB25_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_umin_saddr_i32_rtn_neg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB25_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    v_min_u32_e32 v4, v5, v1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB25_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_umin_saddr_i32_rtn_neg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v2, v0
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB25_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v5, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_min_u32_e32 v4, v5, v1
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB25_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  %rtn = atomicrmw umin ptr addrspace(1) %gep1, i32 %data seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

; Non-returning 32-bit unsigned-min atomic, no immediate offset.
; Same addressing as the returning variant (%sbase inreg + zext(%voffset)),
; but the atomicrmw result (%unused) is dropped and the function returns void.
; The checks still expect the compare-and-swap loop (v_min_u32 +
; global_atomic_cmpswap with glc); the value returned by the cmpswap is copied
; back into the "expected" register for the next iteration, and the function
; ends with s_endpgm.
define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umin_saddr_i32_nortn:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dword v5, v0, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB26_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_min_u32_e32 v4, v5, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB26_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_umin_saddr_i32_nortn:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dword v5, v0, s[2:3]
; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB26_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_min_u32_e32 v4, v5, v1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB26_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: global_umin_saddr_i32_nortn:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v5, v0, s[2:3]
; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB26_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_min_u32_e32 v4, v5, v1
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT:    v_mov_b32_e32 v5, v0
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB26_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %unused = atomicrmw umin ptr addrspace(1) %gep0, i32 %data seq_cst
  ret void
}

; Non-returning 32-bit unsigned-min atomic with an extra constant byte offset
; of -128 on top of %sbase + zext(%voffset). The atomicrmw result is unused
; and the function returns void.
; The checks expect the compare-and-swap loop with "offset:-128" folded into
; both the initial global load and the global_atomic_cmpswap, ending in
; s_endpgm.
define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: global_umin_saddr_i32_nortn_neg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dword v5, v0, s[2:3] offset:-128
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB27_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_min_u32_e32 v4, v5, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB27_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_umin_saddr_i32_nortn_neg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dword v5, v0, s[2:3] offset:-128
; GFX10-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB27_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_min_u32_e32 v4, v5, v1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB27_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: global_umin_saddr_i32_nortn_neg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v5, v0, s[2:3] offset:-128
; GFX11-NEXT:    v_add_co_u32 v2, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB27_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_min_u32_e32 v4, v5, v1
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
; GFX11-NEXT:    v_mov_b32_e32 v5, v0
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB27_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  %unused = atomicrmw umin ptr addrspace(1) %gep1, i32 %data seq_cst
  ret void
}

; Returning 64-bit unsigned-min atomic, no immediate offset.
; The address is %sbase (inreg) plus the zero-extended 32-bit %voffset; the
; old i64 value is returned bitcast to <2 x float>.
; The checks expect a compare-and-swap loop: the minimum is formed with a
; 64-bit unsigned compare (v_cmp_le_u64) feeding two v_cndmask selects, then
; stored with a 64-bit global cmpswap (_x2 on GFX9/GFX10, _b64 on GFX11).
; After the loop the result pair is moved into v0/v1 for the return.
define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umin_saddr_i64_rtn:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v6, s3
; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB28_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v10, v4
; GFX9-NEXT:    v_mov_b32_e32 v9, v3
; GFX9-NEXT:    v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
; GFX9-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB28_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, v3
; GFX9-NEXT:    v_mov_b32_e32 v1, v4
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_umin_saddr_i64_rtn:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3]
; GFX10-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB28_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v10, v4
; GFX10-NEXT:    v_mov_b32_e32 v9, v3
; GFX10-NEXT:    v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
; GFX10-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB28_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v0, v3
; GFX10-NEXT:    v_mov_b32_e32 v1, v4
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_umin_saddr_i64_rtn:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[3:4], v0, s[2:3]
; GFX11-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB28_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v10, v4
; GFX11-NEXT:    v_mov_b32_e32 v9, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
; GFX11-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB28_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v0, v3
; GFX11-NEXT:    v_mov_b32_e32 v1, v4
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %rtn = atomicrmw umin ptr addrspace(1) %gep0, i64 %data seq_cst
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

; Returning 64-bit unsigned-min atomic with an extra constant byte offset of
; -128 on top of %sbase + zext(%voffset). The old i64 value is returned
; bitcast to <2 x float>.
; The checks expect the same 64-bit compare-and-swap loop as the zero-offset
; variant (v_cmp_le_u64 + two v_cndmask, then a 64-bit global cmpswap), with
; "offset:-128" folded into both the initial load and the cmpswap.
define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umin_saddr_i64_rtn_neg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
; GFX9-NEXT:    v_mov_b32_e32 v6, s3
; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB29_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v10, v4
; GFX9-NEXT:    v_mov_b32_e32 v9, v3
; GFX9-NEXT:    v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
; GFX9-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB29_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, v3
; GFX9-NEXT:    v_mov_b32_e32 v1, v4
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_umin_saddr_i64_rtn_neg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
; GFX10-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB29_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v10, v4
; GFX10-NEXT:    v_mov_b32_e32 v9, v3
; GFX10-NEXT:    v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
; GFX10-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB29_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v0, v3
; GFX10-NEXT:    v_mov_b32_e32 v1, v4
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_umin_saddr_i64_rtn_neg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[3:4], v0, s[2:3] offset:-128
; GFX11-NEXT:    v_add_co_u32 v5, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB29_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v10, v4
; GFX11-NEXT:    v_mov_b32_e32 v9, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
; GFX11-NEXT:    v_cndmask_b32_e32 v8, v2, v10, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v7, v1, v9, vcc
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB29_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v0, v3
; GFX11-NEXT:    v_mov_b32_e32 v1, v4
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  %rtn = atomicrmw umin ptr addrspace(1) %gep1, i64 %data seq_cst
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

; Non-returning 64-bit unsigned-min atomic, no immediate offset.
; Same addressing as the returning variant (%sbase inreg + zext(%voffset)),
; but the atomicrmw result (%unused) is dropped and the function returns void.
; The checks still expect the 64-bit compare-and-swap loop (v_cmp_le_u64 +
; two v_cndmask, then a 64-bit global cmpswap with glc); the pair returned by
; the cmpswap is copied back into the "expected" registers for the next
; iteration, and the function ends with s_endpgm.
define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umin_saddr_i64_nortn:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB30_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX9-NEXT:    v_mov_b32_e32 v6, v4
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v5, v3
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB30_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_umin_saddr_i64_nortn:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3]
; GFX10-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB30_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT:    v_mov_b32_e32 v6, v4
; GFX10-NEXT:    v_mov_b32_e32 v5, v3
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB30_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: global_umin_saddr_i64_nortn:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[5:6], v0, s[2:3]
; GFX11-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB30_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT:    v_mov_b32_e32 v6, v4
; GFX11-NEXT:    v_mov_b32_e32 v5, v3
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB30_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %unused = atomicrmw umin ptr addrspace(1) %gep0, i64 %data seq_cst
  ret void
}

; Non-returning 64-bit unsigned-min atomic with an extra constant byte offset
; of -128 on top of %sbase + zext(%voffset). The atomicrmw result is unused
; and the function returns void.
; The checks expect the 64-bit compare-and-swap loop with "offset:-128" folded
; into both the initial global load and the 64-bit cmpswap, ending in
; s_endpgm.
define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: global_umin_saddr_i64_nortn_neg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:  .LBB31_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX9-NEXT:    v_mov_b32_e32 v6, v4
; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v5, v3
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_cbranch_execnz .LBB31_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_umin_saddr_i64_nortn_neg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
; GFX10-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB31_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX10-NEXT:    v_mov_b32_e32 v6, v4
; GFX10-NEXT:    v_mov_b32_e32 v5, v3
; GFX10-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX10-NEXT:    s_cbranch_execnz .LBB31_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: global_umin_saddr_i64_nortn_neg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[5:6], v0, s[2:3] offset:-128
; GFX11-NEXT:    v_add_co_u32 v7, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
; GFX11-NEXT:  .LBB31_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
; GFX11-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v5, vcc
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
; GFX11-NEXT:    v_mov_b32_e32 v6, v4
; GFX11-NEXT:    v_mov_b32_e32 v5, v3
; GFX11-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
; GFX11-NEXT:    s_cbranch_execnz .LBB31_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
  %unused = atomicrmw umin ptr addrspace(1) %gep1, i64 %data seq_cst
  ret void
}

; NOTE(review): none of the umin test functions above attach attribute group
; #0 to a definition or call; presumably a leftover -- confirm against the
; rest of the file before removing it.
attributes #0 = { argmemonly nounwind willreturn }
